import spacy
import pandas as pd
from tqdm.auto import tqdm
import swifter
import plotly.express as px
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import textacy
from collections import Counter
import random
import os
import pickle
from matplotlib.pyplot import figure
from pathlib import Path
import ast
# Use Plotly as the backend for DataFrame.plot() and pin the RNG seed
# so any randomized output is reproducible across notebook runs.
pd.options.plotting.backend = "plotly"
random.seed(123)
def cloud_from_lemmas(word_counts):
    """Render a word cloud sized by lemma frequency.

    Parameters
    ----------
    word_counts : mapping (dict or collections.Counter)
        Lemma -> frequency; higher counts are drawn larger.
    """
    wc = WordCloud(width=800, height=400)
    wc.generate_from_frequencies(frequencies=word_counts)
    plt.figure(figsize=(10, 8))
    plt.imshow(wc)
    # Fix: the original left matplotlib's tick axes visible, which carry
    # no meaning for a word-cloud image.
    plt.axis('off')
def plot_counts(counts):
    """Horizontal bar chart of word frequencies.

    `counts` is a DataFrame with 'word' and 'count' columns; the most
    frequent word ends up at the top of the chart.
    """
    fig = px.bar(counts, orientation='h', y='word', x='count')
    # Plotly draws horizontal bars bottom-up by default; flip the y-axis
    # so rank 1 sits on top.
    fig.update_yaxes(autorange="reversed")
    fig.update_layout(bargap=0.30, font={'size': 10})
    return fig
# Load the small English spaCy pipeline used for tokenisation/lemmatisation.
en = spacy.load("en_core_web_sm")

# One CSV of scraped tweets per day, 2022-02-24 .. 2022-03-02.
week = Path("./first_week")
mar02 = pd.read_csv(week / "02_mar.csv")
mar01 = pd.read_csv(week / "01_mar.csv")
feb28 = pd.read_csv(week / "28_feb.csv")
feb27 = pd.read_csv(week / "27_feb.csv")
feb26 = pd.read_csv(week / "26_feb.csv")
feb25 = pd.read_csv(week / "25_feb.csv")
feb24 = pd.read_csv(week / "24_feb.csv")

print(len(feb24))
# feb24 was scraped for much longer (1.46M rows); trim it to the same
# size as the other days so all seven frames are comparable.
feb24 = feb24.iloc[:48001]

l = [mar02, mar01, feb28, feb27, feb26, feb25, feb24]
for day in l:
    print(len(day))

# Single frame with all seven days stacked (336,007 rows in the notebook run).
df = pd.concat(l)
# Mar 02: keep the first 20k tweets, restrict to English, run the spaCy
# pipeline over the text, lemmatise, and count lemmas (dropping stopwords,
# punctuation and a few noise tokens), then plot the results.
mar02 = mar02.head(20000)
df = (
    mar02.loc[mar02['Language'] == 'en']
    .drop(columns=['Unnamed: 0', 'Language'])
    .reset_index()
)
df["Text_en"] = df['Text'].swifter.apply(en)
mar02 = df
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
mar02['lemmas'] = mar02.Text_en.apply(
    lambda doc: [t.lemma_ for t in doc
                 if not t.is_stop and not t.is_punct
                 and t.lemma_ not in not_interesting]
)
mar02_word_counts = Counter(mar02.lemmas.sum())
mar02_counts = pd.DataFrame(mar02_word_counts.most_common(60), columns=['word', 'count'])
cloud_from_lemmas(mar02_word_counts)
plot_counts(mar02_counts)
Counter({'solidarity': 99,
'Ukraine': 20145,
'time': 669,
'remove': 92,
'color': 34,
'white': 127,
'blue': 65,
'red': 43,
'thing': 455,
'represent': 31,
'russian': 2603,
'flag': 138,
'City': 45,
'Falls': 20,
'Russia': 5432,
'strike': 95,
'civilian': 595,
'\xa0': 568,
'target': 194,
'https://t.co/It0d0ZMv9E': 1,
'good': 602,
'@RealLifeLore22': 1,
'video': 225,
'Putin': 3246,
'want': 1124,
'act': 160,
'russophilic': 1,
'buffer': 13,
'zone': 130,
'oil': 239,
'field': 29,
'strategic': 48,
'advantage': 12,
'https://t.co/jppja9ogmG': 1,
'Invading': 15,
'choice': 77,
'@GGraczka': 1,
'@Ukraine': 477,
'say': 1197,
'share': 153,
'testimony': 5,
'one': 79,
'commit': 182,
'crime': 488,
'punish': 44,
'death': 341,
'penalty': 10,
'type': 39,
'@zoltarssg': 1,
'@orwellandrade': 1,
'@PredictIt': 1,
'announce': 80,
'@SSGamblers': 1,
'discord': 5,
'dump': 8,
'trade': 43,
'30': 61,
'probably': 157,
'bad': 392,
'bet': 41,
'place': 215,
'mean': 343,
'obvious': 35,
'recommend': 19,
'pod': 1,
'invasion': 1399,
'shelling': 46,
'news': 395,
'report': 291,
'@POTUS': 257,
'rewrite': 4,
'speech': 64,
'Israel': 149,
'quiet': 18,
'think': 985,
'going': 91,
'to': 113,
'appear': 63,
'move': 84,
'tactical': 14,
'battlefield': 11,
'nuke': 153,
'close': 224,
'bode': 1,
'desperate': 27,
'people': 2271,
'action': 219,
'leave': 403,
'cornered': 3,
'RAT': 2,
'space': 56,
'run': 157,
'prepared': 16,
'kill': 585,
'https://t.co/gcB6vsj4ek': 1,
'Victoria': 22,
'Secret': 17,
'VSCO': 8,
'Q4': 8,
'2021': 32,
'earning': 12,
'issue': 181,
'weak': 69,
'outlook': 13,
'quote': 36,
'https://t.co/rXHO3HWBkP': 1,
'@usaldak47': 1,
'@Israel': 20,
'90': 16,
'year': 469,
'travel': 55,
'past': 106,
'😥': 18,
'@blunomatterhoo': 1,
'find': 241,
'try': 512,
'assimilation': 1,
'idea': 122,
"didn't": 5,
'will': 260,
'man': 329,
'Toronto': 7,
'ground': 128,
'Poland': 280,
'help': 1225,
'refugee': 298,
'direct': 56,
'Winnipeg': 2,
'group': 139,
'collect': 45,
'1000': 19,
'kg': 1,
'humanitarian': 273,
'aid': 262,
'radio': 23,
'diaper': 7,
'Canada': 154,
'@nuttallreport': 1,
'@OmarMosleh': 1,
'https://t.co/MiIbGFp6QR': 1,
'Lady': 7,
'take': 546,
'stand': 638,
'amid': 117,
'Invasion': 138,
'Panic': 2,
'Tears': 1,
'People': 147,
'https://t.co/pjxrl4lq8l': 1,
'@da_cheese': 1,
'@redfishstream': 1,
'journalist': 46,
'invade': 869,
'ukraine': 974,
'😱': 15,
'St.': 22,
'Javelin': 9,
'vs': 60,
'Improvised': 2,
'Armor': 1,
'Tanks': 6,
'|': 360,
'sofrep': 1,
'https://t.co/x5agxv1zvq': 1,
'fuck': 147,
'prayer': 120,
'UkraineRussianWar': 60,
'RussiaUkraine': 96,
'banrussiafromswift': 3,
'CancelRussia': 4,
'DefeatPutin': 7,
'DefendUkraine': 13,
'FckPutin': 9,
'FreeUkraine': 14,
'https://t.co/vfhnqq0jxr': 1,
'@joncoopertweet': 27,
'@gop': 12,
'lead': 196,
'Trump': 425,
'blackmail': 27,
'vote': 505,
'withhold': 39,
'support': 1405,
'make': 285,
'@DicksonReps': 1,
'@murpharoo': 1,
'@joshbutler': 1,
'@msmarto': 1,
'wow': 47,
'west': 190,
'atrocity': 57,
'ignore': 72,
'attack': 578,
'obviously': 57,
'lot': 229,
'war': 3024,
'definitely': 62,
'particular': 22,
'get': 463,
'@ksidiii': 1,
'standard': 29,
'deviation': 1,
'price': 156,
'combine': 10,
'straight': 40,
'day': 521,
'trading': 6,
'10': 89,
'range': 17,
'month': 114,
'like': 1173,
'penny': 10,
'stock': 67,
'embargo': 12,
'wipe': 35,
'entire': 82,
'week': 288,
'gain': 46,
'resolution': 237,
'\u2066@nytimes\u2069': 3,
'\u2066@doctecazoid\u2069': 1,
'right': 771,
'wait': 145,
'Olympics': 151,
'https://t.co/pjnszazykh': 1,
'conflict': 387,
'put': 87,
'cyber': 13,
'warfare': 25,
'center': 30,
'-Host': 1,
'@tteminWFED': 1,
'@markcmontgomery': 1,
'@FDD_CCTI': 1,
' \n\n': 54,
'https://t.co/dugszt1h5d': 1,
'@FederalNewsNet': 1,
'customer': 9,
'ask': 419,
'ukrainian': 669,
'decal': 1,
'2': 325,
'Tryzub': 2,
'1': 290,
'great': 229,
'grandfather': 8,
'terrible': 40,
'happen': 562,
'standwithukraine': 32,
'https://t.co/t1ZW6pBPbn': 1,
'@seanhannity': 7,
'go': 920,
'sad': 115,
'leader': 222,
'assassinate': 17,
'world': 1031,
'cry': 55,
'sick': 30,
'feeling': 23,
'feel': 296,
'fall': 229,
'save': 172,
'😢': 33,
'ukrainerussiawar': 267,
'dead': 139,
'soldier': 416,
'near': 105,
'Kherson': 161,
'warning': 22,
'GRAPHIC': 2,
'UkraineInvasion': 39,
'SlavaUkraini': 37,
'russianukrainianwar': 161,
'https://t.co/yBXJCcZBAd': 1,
'@McFaul': 59,
'mad': 37,
'simply': 55,
'bear': 77,
'wind': 9,
'hold': 249,
'card': 19,
'i.e.': 7,
'know': 1032,
'дурак': 1,
'fool': 29,
'stupid': 80,
'etc': 134,
'\n ': 192,
'cf': 1,
'eat': 30,
'Queen': 3,
'Spades': 2,
'fail': 100,
'bid': 21,
'shoot': 81,
'moon': 1,
'https://t.co/gnwr1eoizz': 1,
'@zebedy1997': 2,
'@ffschristie': 1,
'@seanzjay': 4,
'@yimbychris': 6,
'@jeremycorbyn': 27,
'@stwuk': 24,
'forget': 133,
'state': 326,
'talk': 408,
'future': 114,
'fear': 112,
'direction': 23,
'course': 84,
'Europe': 404,
'look': 427,
'hate': 114,
'Libya': 42,
'Afghanistan': 170,
'tragedy': 30,
'far': 221,
'sadly': 32,
'@sharonlwa': 1,
'44': 11,
'm': 122,
'7b': 1,
'Earth': 14,
'european': 136,
'NATO': 1282,
'nuclear': 333,
'6,956,000,000': 1,
'horrible': 43,
'situation': 281,
'heart': 175,
'break': 166,
'@music__bee': 1,
'pshower': 1,
'expect': 103,
'God': 182,
'Bless': 13,
'unholy': 1,
'sean': 1,
'mug': 3,
'design': 18,
'https://t.co/xwhusp2lb2': 1,
'https://t.co/km7o2xlz3c': 1,
'@geofflath': 2,
'@bjoneslaw1972': 4,
'@Osinttechnical': 5,
'Russians': 571,
'Ukrainians': 374,
'advice': 15,
'intent': 12,
'push': 139,
'propaganda': 221,
'social': 93,
'medium': 315,
'insurgency': 12,
'care': 295,
'interested': 21,
'handle': 25,
'ally': 130,
'responsible': 49,
'crisis': 262,
'Dr.': 13,
'John': 41,
'J.': 2,
'Mearsheimer': 25,
'important': 127,
'understand': 271,
'West': 243,
'2008': 23,
'turn': 191,
'western': 256,
'bulwark': 2,
'border': 331,
'https://t.co/lrsqefjte7': 1,
'@3lidw': 2,
'@mod_russia': 12,
'believe': 318,
'win': 246,
'come': 546,
'strong': 161,
'Union': 143,
'new': 222,
'friendly': 23,
'member': 243,
'tell': 484,
'school': 67,
'ScoonTv': 4,
'https://t.co/7ojjcny5fr': 1,
'citizen': 254,
'absolutely': 95,
'eviscerate': 2,
'MSM': 29,
'libtard': 1,
'virtue': 25,
'signal': 30,
'idiotic': 5,
'family': 313,
'friend': 206,
'https://t.co/mUcZm7RRKA': 1,
'add': 133,
'narrative': 46,
'Biden': 600,
'Divert': 1,
'Agents': 1,
'U.S.': 247,
'Southern': 14,
'Border': 10,
'send': 659,
'Assist': 1,
'Conflict': 22,
'https://t.co/t3p8LXPERg': 1,
'@pamelageller': 1,
'@cesc_james': 1,
'@GalekNaughty': 4,
'@spectatorindex': 46,
'army': 209,
'armoured': 5,
'vehicle': 61,
'Iraq': 164,
'2003': 13,
'Google': 28,
'participate': 16,
'link': 56,
'answer': 79,
'https://t.co/GdIKv67g78': 1,
'@DreamLeaf5': 4,
'@RaccoonLeandro': 3,
'Right': 13,
'misremember': 1,
'specific': 20,
'sure': 224,
'agree': 194,
'appeasement': 12,
'short': 58,
'troop': 319,
'immediately': 79,
'benefit': 74,
'tribe': 2,
'settle': 24,
'Dnieper': 5,
'river': 9,
'Belarus': 159,
'North': 48,
'spread': 71,
'northward': 1,
'northern': 7,
'Volga': 1,
'valley': 1,
'east': 53,
'modern': 25,
'Moscow': 131,
'basin': 1,
'Dniester': 1,
'Buh': 1,
'present': 41,
'southern': 34,
'head': 149,
'Golda': 1,
'Meir': 1,
'watch': 354,
'Munich': 9,
'https://t.co/arkswwBtNw': 1,
'nerve': 10,
'pro': 119,
'call': 311,
'fascist': 60,
'remember': 163,
'flight': 51,
'MH17': 6,
'separatist': 27,
'hide': 61,
'fact': 195,
'slaughter': 54,
'nearly': 61,
'300': 13,
'international': 143,
'Jesus': 18,
'Украина': 5,
'РоссияСмотри': 1,
'https://t.co/lq27ie1khm': 1,
'guy': 225,
'fight': 788,
'Moldova': 44,
'https://t.co/mussysigqc': 1,
'Iryna': 2,
'Red': 29,
'Cross': 18,
'Fund': 12,
'https://t.co/ds6bbiwd1e': 1,
'@crist_aras': 2,
'u': 144,
'well': 217,
'reason': 224,
'terrorist': 61,
'@dbrand': 1,
'@Mrwhosetheboss': 1,
'@MKBHD': 2,
's': 89,
'beg': 26,
'prize': 4,
'punishable': 3,
'worst': 4,
'frontline': 7,
'🤣': 100,
'😂': 176,
'Street': 18,
'combat': 41,
'minute': 62,
'territorial': 27,
'defense': 84,
'liberate': 19,
'city': 370,
'russiaukrainewar': 18,
'explain': 111,
'https://t.co/jjymqnb7hz': 1,
'@youtube': 195,
'@nikkihaley': 3,
'@repspartz': 4,
'hell': 95,
'@europeanpan': 1,
'self': 68,
'hater': 1,
'love': 268,
'influence': 42,
'NYT': 19,
'China': 473,
'delay': 73,
'winter': 24,
'crazy': 62,
'squeeze': 9,
'$': 343,
'Korea': 57,
'@newsmax': 11,
'need': 1054,
'@drew95ca': 1,
'@jemoole': 1,
'@DavidAn53897256': 5,
'@ZelenskyyUa': 184,
'India': 207,
'show': 185,
'decency': 4,
'refuse': 111,
'Collabration': 1,
'way': 559,
'street': 51,
'@ECR_CoR': 1,
'grateful': 16,
'@eu_cor': 1,
'dedicated': 4,
'EU': 395,
'region': 96,
'urgent': 38,
'@EU_CoR': 1,
'voice': 48,
'grant': 21,
'candidate': 11,
'status': 20,
'sound.#stoprussianaggression': 1,
'russiainvadedukraine': 8,
'🔴': 21,
'breaking': 48,
' \n\n': 1,
'air': 155,
'alert': 23,
'kyiv': 121,
'mobilize': 7,
'civil': 48,
'team': 99,
'devilputin': 1,
'fuckputin': 6,
' ': 140,
'https://t.co/K1XQiiTHEr': 1,
'findyourthe': 1,
'redbubble': 1,
'@dagenmcdowell': 2,
'retirement': 4,
'America': 220,
'give': 382,
'big': 243,
'HELPFUL': 2,
'foreign': 112,
'PUTIN': 65,
'buy': 179,
'111': 4,
'barrel': 23,
'UKRAINE': 315,
'PEOPLE': 36,
'failure': 45,
'https://t.co/hzbthamn3f': 1,
'hunter': 8,
'deal': 115,
'continue': 242,
'https://t.co/yZyUNvqmsk': 1,
'War': 419,
'https://t.co/6iAHcwPQav': 1,
'see': 360,
'countless': 12,
'feature': 15,
'story': 108,
'Molotov': 14,
'cocktail': 19,
'scene': 25,
'classic': 2,
'nonviolent': 1,
'resistance': 66,
'rarely': 7,
'stunning': 6,
'effective': 25,
'tactic': 39,
'https://t.co/bf3odvmx2a': 1,
'sunflower': 31,
'national': 133,
'flower': 28,
'let': 510,
'start': 426,
'post': 196,
'ukranian': 47,
'cover': 90,
'Sunflowers': 4,
' \n': 54,
'Facebook': 19,
'surprise': 28,
'Zuck': 1,
'allow': 189,
'https://t.co/klweym3jzs': 1,
'worry': 72,
'fmr': 4,
'Zelenskyy': 75,
'advisor': 14,
'https://t.co/f3tqbathwv': 1,
'@msnbc': 6,
'@lubimayarussiya': 1,
'@aaronjmate': 22,
'@TrumpPres2017': 1,
'Donbas': 51,
'Cope': 1,
'teach': 26,
'history': 206,
'actively': 28,
'live': 384,
'State': 89,
'High': 11,
'student': 232,
'textbook': 2,
'tv': 105,
'screen': 14,
'inform': 18,
'https://t.co/ymBDkhWf7i': 1,
'@mint_4_ukraine': 1,
'perfect': 21,
'example': 72,
'powerful': 45,
'NFT': 43,
'work': 326,
'impact': 79,
'donate': 336,
'piece': 61,
'art': 35,
'https://t.co/8kqignoofx': 1,
'orange': 7,
'cone': 3,
'stop': 1001,
'snow': 5,
'clearing': 2,
'truck': 30,
'track': 31,
'Québec': 1,
'thousand': 126,
'embe': 1,
'grenade': 13,
'case': 107,
'tank': 169,
'thé': 1,
'respect': 83,
'deserve': 78,
'https://t.co/PjCuEYH9Wl': 1,
'@UptownComCapita': 1,
'@JackPosobiec': 14,
'peacekeeper': 9,
'@housegop': 1,
'FOUGHT': 1,
'DEMOCRACY': 5,
'HARD': 1,
'invader': 44,
'Jimmy': 6,
'Max': 9,
'yes': 266,
'Neo': 39,
'nazi': 227,
'Recap': 6,
'w/': 42,
'Blumenthal': 7,
'https://t.co/ek3prxnrwu': 1,
'@chloevtweet': 1,
'launder': 11,
'money': 371,
'got': 68,
'honey': 2,
'pot': 6,
'flow': 13,
'dry': 10,
'dime': 5,
'seize': 82,
'reconstruction': 12,
'cost': 76,
'bit': 76,
'🌻': 165,
'🙏': 322,
'https://t.co/5bwbzep5gm': 1,
'@burneroftaxe': 2,
'@NSStr0ng': 1,
'@GenuineNat': 2,
'major': 101,
'cite': 24,
'@MartinHeinrich': 10,
'weapon': 582,
'assistance': 199,
'defend': 385,
'innocent': 337,
'@potu': 238,
'provide': 306,
'safeairliftukraine': 138,
'StopPutin': 184,
'sir': 38,
'martin': 5,
'indifferent': 2,
'@Uncle_Joe_x': 1,
'@saracha45158427': 3,
'@nytimes': 62,
'Donbass': 39,
'Crimea': 117,
'anti': 215,
'instal': 31,
'puppet': 76,
'goverment': 6,
'https://t.co/T42aBbZ6tH': 1,
'@CivMilAir': 3,
'eye': 79,
'belarus': 2,
'supply': 218,
'shipment': 15,
'ready': 54,
'warn': 109,
'security': 117,
'@hopie93632267': 1,
'@lancesterling12': 1,
'12': 19,
'Palestinians': 30,
'learn': 103,
'human': 152,
'violent': 26,
'hatred': 18,
'Tampa': 1,
'newlywed': 1,
'urge': 64,
'speak': 184,
'https://t.co/wzlmmnc8og': 1,
'Trey': 4,
'Yingst': 1,
'@FoxNews': 44,
'explosion': 46,
'Kiyv': 5,
'Capitol': 15,
'@niii65919770': 1,
'@kotnikjanez': 1,
'@Caucasuswar': 14,
'lmao': 10,
'angle': 10,
'shit': 117,
'nazis': 26,
'pick': 49,
'looter': 8,
'subject': 23,
'stockade': 1,
'cling': 3,
'wrap': 14,
'utility': 2,
'pole': 8,
'Reddit': 3,
'https://t.co/Jyytei9MYh': 1,
'@TimInHonolulu': 9,
'@whnsc': 5,
'@DefenseIntel': 10,
'@DI_Ukraine': 10,
'use': 281,
'keep': 109,
'option': 39,
'@gopleader': 21,
'liberal': 32,
'open': 146,
'border.we': 1,
'sotuincrisi': 1,
'wonder': 115,
'Iranians': 12,
'bout': 8,
'kind': 83,
'pay': 202,
'tax': 22,
'dollar': 44,
'Crimes': 15,
'https://t.co/4ypjvajnzn': 1,
'sympathetic': 10,
'movement': 30,
'term': 84,
'origin': 7,
'root': 34,
'fault': 59,
'differently': 10,
'5': 136,
'effort': 98,
'play': 166,
'kingmaker': 1,
'euromaiden': 1,
'unmitigated': 1,
'disaster': 35,
'confirm': 75,
'Airdrop': 20,
'receive': 76,
'Crypto': 64,
'Donations': 32,
'https://t.co/bDGssQQCqA': 1,
'floodgate': 1,
'racism': 102,
'weird': 29,
'drunk': 5,
'wedding': 3,
'dgaf': 1,
'https://t.co/hbsolbzrw7': 1,
'@ameyaw112': 1,
'Wob3k': 1,
'anaa': 1,
'roman': 27,
'Abramovich': 91,
'billionaire': 37,
'Chelsea': 114,
'owner': 49,
'sell': 219,
'club': 69,
'https://t.co/ywwznjeotd': 1,
'House': 125,
'Resolution': 31,
'Montana': 17,
'Rep.': 30,
'Matt': 32,
'Rosendale': 46,
'Kentucky': 19,
'Thomas': 49,
'Massie': 65,
'Arizona': 26,
'Paul': 60,
'Gosar': 61,
'reply': 22,
'bank': 54,
'account': 105,
'suffer': 121,
'dysfunctional': 1,
'turmoil': 3,
'hardship': 8,
'killing': 23,
'nonsense': 31,
'🤷': 31,
'🏽\u200d': 6,
'♀': 26,
'️': 394,
'https://t.co/CUwfCXkMO0': 1,
'@MikhailFridman': 1,
'hear': 203,
'denounce': 38,
'power': 235,
'convince': 32,
'circle': 12,
'end': 394,
'totally': 49,
'unjustified': 6,
'pariah': 11,
'👉': 16,
'Fresh': 1,
'channel': 36,
'https://t.co/hwVsGexSMM': 1,
'News': 249,
'Dogecoin': 44,
'community': 103,
'53': 21,
'k': 54,
'country': 1533,
'hint': 21,
'upcoming': 18,
'airdrop': 45,
'\n\n ': 64,
'GEMs': 1,
'https://t.co/znaxtdiyaj': 1,
'onlygem': 1,
'btc': 9,
'eth': 10,
'bnb': 3,
'feg': 1,
'nft': 20,
'@BeattieDoug': 1,
'typical': 12,
'repulsive': 3,
'hypocrisy': 29,
'familiar': 10,
'moral': 52,
'equivalence': 4,
'campaign': 56,
'blood': 64,
'child': 318,
'hand': 168,
'atone': 1,
'official': 229,
'Kyiv': 243,
'loud': 26,
'beginning': 18,
'UkraineRussiaWar': 38,
'UkraineUnderAttack': 50,
'https://t.co/LfzipjKrhW': 1,
'syrian': 7,
'foul': 1,
'racist': 100,
'coverage': 67,
'CBC': 10,
'https://t.co/lnp9ibw3eu': 1,
'catch': 44,
'athlete': 20,
'sport': 31,
'organization': 54,
'respond': 48,
'violence': 55,
'https://t.co/dhDXYnXvCN': 1,
'@Benjami54553803': 3,
'@jamwood20': 2,
'@covid19_murder': 6,
'@unionjock1': 2,
'@Sniper_Wolf5': 7,
'americans': 6,
'fighting': 72,
'@jaycybersecuri1': 1,
'@auraalborn': 2,
'@astrosoul9': 3,
'@ADanielHill': 4,
'scam': 20,
'raise': 117,
'56,675': 1,
'asset': 92,
'donation': 166,
'enter': 68,
'giveaway': 1,
'certify': 3,
'government': 390,
'site': 42,
'hope': 348,
'cause': 193,
'❤': 181,
'instruction': 7,
'spirit': 28,
'https://t.co/hHp4KNSOj3': 1,
'@IAPonomarenko': 86,
'hero': 81,
'limb': 2,
'compliment': 2,
'@SenToddYoung': 2,
'commentary': 10,
'precise': 4,
'measure': 39,
'senator': 11,
'partisan': 11,
'dig': 16,
'bait': 11,
'criticism': 17,
'fare': 1,
'beneath': 3,
'dignity': 6,
'office': 52,
'https://t.co/mBxBJVj0KX': 1,
'@JeffSchogol': 1,
'Shit': 2,
'@chinahand': 1,
'decade': 80,
'heartless': 6,
'https://t.co/u5o90ieqsk': 1,
'@cranky_yankee': 1,
'Zelensky': 210,
'iconic': 4,
'movie': 25,
'life': 332,
'evacuate': 57,
'meme': 17,
'standing': 8,
'ovation': 10,
'damn': 40,
'@potus': 16,
'https://t.co/IRnsT6XLDY': 1,
'@HonNonsoNwankwo': 1,
'@transferchecker': 1,
'staying': 1,
'neutral': 70,
'favour': 21,
'Finland': 73,
'swiss': 4,
'soon': 156,
'later': 39,
...})
# Mar 01: same pipeline — first 20k tweets, English only, spaCy parse,
# lemma filtering, per-day lemma counts.
mar01 = mar01.head(20000)
df = (
    mar01.loc[mar01['Language'] == 'en']
    .drop(columns=['Unnamed: 0', 'Language'])
    .reset_index()
)
df["Text_en"] = df['Text'].swifter.apply(en)
mar01 = df
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
mar01['lemmas'] = mar01.Text_en.apply(
    lambda doc: [t.lemma_ for t in doc
                 if not t.is_stop and not t.is_punct
                 and t.lemma_ not in not_interesting]
)
mar01_word_counts = Counter(mar01.lemmas.sum())
# Feb 28: same pipeline — first 20k tweets, English only, spaCy parse,
# lemma filtering, per-day lemma counts.
feb28 = feb28.head(20000)
df = (
    feb28.loc[feb28['Language'] == 'en']
    .drop(columns=['Unnamed: 0', 'Language'])
    .reset_index()
)
df["Text_en"] = df['Text'].swifter.apply(en)
feb28 = df
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
feb28['lemmas'] = feb28.Text_en.apply(
    lambda doc: [t.lemma_ for t in doc
                 if not t.is_stop and not t.is_punct
                 and t.lemma_ not in not_interesting]
)
feb28_word_counts = Counter(feb28.lemmas.sum())
# Feb 27: same pipeline — first 20k tweets, English only, spaCy parse,
# lemma filtering, per-day lemma counts.
feb27 = feb27.head(20000)
df = (
    feb27.loc[feb27['Language'] == 'en']
    .drop(columns=['Unnamed: 0', 'Language'])
    .reset_index()
)
df["Text_en"] = df['Text'].swifter.apply(en)
feb27 = df
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
feb27['lemmas'] = feb27.Text_en.apply(
    lambda doc: [t.lemma_ for t in doc
                 if not t.is_stop and not t.is_punct
                 and t.lemma_ not in not_interesting]
)
feb27_word_counts = Counter(feb27.lemmas.sum())
# Feb 26: same pipeline — first 20k tweets, English only, spaCy parse,
# lemma filtering, per-day lemma counts.
feb26 = feb26.head(20000)
df = (
    feb26.loc[feb26['Language'] == 'en']
    .drop(columns=['Unnamed: 0', 'Language'])
    .reset_index()
)
df["Text_en"] = df['Text'].swifter.apply(en)
feb26 = df
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
feb26['lemmas'] = feb26.Text_en.apply(
    lambda doc: [t.lemma_ for t in doc
                 if not t.is_stop and not t.is_punct
                 and t.lemma_ not in not_interesting]
)
feb26_word_counts = Counter(feb26.lemmas.sum())
# Feb 25: same pipeline — first 20k tweets, English only, spaCy parse,
# lemma filtering, per-day lemma counts.
feb25 = feb25.head(20000)
df = (
    feb25.loc[feb25['Language'] == 'en']
    .drop(columns=['Unnamed: 0', 'Language'])
    .reset_index()
)
df["Text_en"] = df['Text'].swifter.apply(en)
feb25 = df
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
feb25['lemmas'] = feb25.Text_en.apply(
    lambda doc: [t.lemma_ for t in doc
                 if not t.is_stop and not t.is_punct
                 and t.lemma_ not in not_interesting]
)
feb25_word_counts = Counter(feb25.lemmas.sum())
# Feb 24: same pipeline — first 20k tweets, English only, spaCy parse,
# lemma filtering, per-day lemma counts.
feb24 = feb24.head(20000)
df = (
    feb24.loc[feb24['Language'] == 'en']
    .drop(columns=['Unnamed: 0', 'Language'])
    .reset_index()
)
df["Text_en"] = df['Text'].swifter.apply(en)
feb24 = df
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
feb24['lemmas'] = feb24.Text_en.apply(
    lambda doc: [t.lemma_ for t in doc
                 if not t.is_stop and not t.is_punct
                 and t.lemma_ not in not_interesting]
)
feb24_word_counts = Counter(feb24.lemmas.sum())
# Mar 02 noun-chunk bigrams: keep exactly-two-token noun chunks whose two
# words are not trivial determiners/handles, then count occurrences.
not_interesting = {'the', '@', 'a', 'this'}
lemmas_ngrams = mar02.Text_en.apply(
    lambda doc: [str(chunk) for chunk in doc.noun_chunks if len(chunk) == 2]
)
lemmas_ngrams = lemmas_ngrams.apply(lambda grams: [g for g in grams if len(g.split()) == 2])
lemmas_ngrams = lemmas_ngrams.apply(
    lambda grams: [g for g in grams
                   if g.split()[0].lower() not in not_interesting
                   and g.split()[1].lower() not in not_interesting]
)
mar02_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
# Mar 01 noun-chunk bigrams (same filtering as the other days).
not_interesting = {'the', '@', 'a', 'this'}
lemmas_ngrams = mar01.Text_en.apply(
    lambda doc: [str(chunk) for chunk in doc.noun_chunks if len(chunk) == 2]
)
lemmas_ngrams = lemmas_ngrams.apply(lambda grams: [g for g in grams if len(g.split()) == 2])
lemmas_ngrams = lemmas_ngrams.apply(
    lambda grams: [g for g in grams
                   if g.split()[0].lower() not in not_interesting
                   and g.split()[1].lower() not in not_interesting]
)
mar01_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
# Feb 28 noun-chunk bigrams (same filtering as the other days).
not_interesting = {'the', '@', 'a', 'this'}
lemmas_ngrams = feb28.Text_en.apply(
    lambda doc: [str(chunk) for chunk in doc.noun_chunks if len(chunk) == 2]
)
lemmas_ngrams = lemmas_ngrams.apply(lambda grams: [g for g in grams if len(g.split()) == 2])
lemmas_ngrams = lemmas_ngrams.apply(
    lambda grams: [g for g in grams
                   if g.split()[0].lower() not in not_interesting
                   and g.split()[1].lower() not in not_interesting]
)
feb28_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
# Feb 27 noun-chunk bigrams (same filtering as the other days).
not_interesting = {'the', '@', 'a', 'this'}
lemmas_ngrams = feb27.Text_en.apply(
    lambda doc: [str(chunk) for chunk in doc.noun_chunks if len(chunk) == 2]
)
lemmas_ngrams = lemmas_ngrams.apply(lambda grams: [g for g in grams if len(g.split()) == 2])
lemmas_ngrams = lemmas_ngrams.apply(
    lambda grams: [g for g in grams
                   if g.split()[0].lower() not in not_interesting
                   and g.split()[1].lower() not in not_interesting]
)
feb27_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
# Feb 26 noun-chunk bigrams (same filtering as the other days).
not_interesting = {'the', '@', 'a', 'this'}
lemmas_ngrams = feb26.Text_en.apply(
    lambda doc: [str(chunk) for chunk in doc.noun_chunks if len(chunk) == 2]
)
lemmas_ngrams = lemmas_ngrams.apply(lambda grams: [g for g in grams if len(g.split()) == 2])
lemmas_ngrams = lemmas_ngrams.apply(
    lambda grams: [g for g in grams
                   if g.split()[0].lower() not in not_interesting
                   and g.split()[1].lower() not in not_interesting]
)
feb26_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
# Feb 25 noun-chunk bigrams (same filtering as the other days).
not_interesting = {'the', '@', 'a', 'this'}
lemmas_ngrams = feb25.Text_en.apply(
    lambda doc: [str(chunk) for chunk in doc.noun_chunks if len(chunk) == 2]
)
lemmas_ngrams = lemmas_ngrams.apply(lambda grams: [g for g in grams if len(g.split()) == 2])
lemmas_ngrams = lemmas_ngrams.apply(
    lambda grams: [g for g in grams
                   if g.split()[0].lower() not in not_interesting
                   and g.split()[1].lower() not in not_interesting]
)
feb25_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
# Feb 24 noun-chunk bigrams (same filtering as the other days).
not_interesting = {'the', '@', 'a', 'this'}
lemmas_ngrams = feb24.Text_en.apply(
    lambda doc: [str(chunk) for chunk in doc.noun_chunks if len(chunk) == 2]
)
lemmas_ngrams = lemmas_ngrams.apply(lambda grams: [g for g in grams if len(g.split()) == 2])
lemmas_ngrams = lemmas_ngrams.apply(
    lambda grams: [g for g in grams
                   if g.split()[0].lower() not in not_interesting
                   and g.split()[1].lower() not in not_interesting]
)
feb24_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
# Aggregate the seven per-day lemma counters into one weekly counter.
# All counts are positive, so summing Counters is equivalent to chained +.
daily_counters = [feb24_word_counts, feb25_word_counts, feb26_word_counts,
                  feb27_word_counts, feb28_word_counts, mar01_word_counts,
                  mar02_word_counts]
all_word_counts = sum(daily_counters, Counter())
all_word_counts.most_common(10)
# Chronological day labels for the x-axis, and the matching per-day lemma
# counters in the same (feb24 -> mar02) order.
x = ['feb24', 'feb25', 'feb26', 'feb27', 'feb28', 'mar01', 'mar02']
counts_list = [feb24_word_counts, feb25_word_counts, feb26_word_counts, feb27_word_counts, feb28_word_counts, mar01_word_counts, mar02_word_counts ]
def plot_over_time(base, counts_list, start, stop, labels=None):
    """Line-plot how entries of `base` vary across the seven days.

    Parameters
    ----------
    base : Counter
        Aggregate counter whose most_common() ordering selects which
        entries (ranks start..stop) get plotted.
    counts_list : list of Counter
        Per-day counters, in the same order as the x-axis labels.
    start, stop : int
        Rank window into base.most_common().
    labels : list of str, optional
        X-axis labels; defaults to the module-level `x` day labels,
        preserving the original behaviour.
    """
    if labels is None:
        labels = x  # module-level day labels ('feb24' .. 'mar02')
    figure(figsize=(16, 10))
    for word, total in base.most_common()[start:stop]:
        # .get() with no default yields None (a gap) on days the word is absent.
        y = [counts.get(word) for counts in counts_list]
        plt.plot(labels, y, label=f"{word} {total}")
    plt.legend()
    plt.show()
# Daily trajectories for successive rank windows of the overall top lemmas.
plot_over_time(all_word_counts, counts_list, 0, 10)
plot_over_time(all_word_counts, counts_list, 1, 11)
plot_over_time(all_word_counts, counts_list, 11, 21)
plot_over_time(all_word_counts, counts_list, 21, 31)
plot_over_time(all_word_counts, counts_list, 31, 41)
plot_over_time(all_word_counts, counts_list, 41, 51)
C:\Users\jakub\anaconda3\envs\WBII\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning:
Glyph 128591 (\N{PERSON WITH FOLDED HANDS}) missing from current font.
# Same plot, but ranked by a single day's counter instead of the weekly total.
plot_over_time(feb24_word_counts, counts_list, 0, 10)
plot_over_time(mar02_word_counts, counts_list, 0, 10)
def plot_without_all(base, counts_list, start, stop, all_counts = all_word_counts):
    """Like plot_over_time, but skip the 50 globally most common words.

    Words whose lowercase form appears in the top-50 of `all_counts` are
    removed from base.most_common(); ranks start..stop of the remainder
    are plotted per day, labelled with the module-level `x` day labels.
    """
    figure(figsize=(16, 10))
    # Set membership is O(1) per word; the original scanned a 50-item list
    # for every entry of base.most_common().
    top50 = {word.lower() for word, _ in all_counts.most_common(50)}
    remainder = [item for item in base.most_common() if item[0].lower() not in top50]
    for word, total in remainder[start:stop]:
        # None (missing day) produces a gap in the line.
        y = [counts.get(word) for counts in counts_list]
        plt.plot(x, y, label=f"{word} {total}")
    plt.legend()
    plt.show()
# Per-day "distinctive" words: each day's top lemmas after removing the 50
# words that dominate the whole week.
plot_without_all(feb24_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
plot_without_all(feb24_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
plot_without_all(feb25_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
plot_without_all(feb25_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
plot_without_all(feb26_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
plot_without_all(feb26_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
plot_without_all(feb27_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
plot_without_all(feb27_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
plot_without_all(feb28_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
plot_without_all(feb28_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
plot_without_all(mar01_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
plot_without_all(mar01_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
plot_without_all(mar02_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
plot_without_all(mar02_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
# Convert the per-day bigram dicts to Counters so they support + and most_common().
mar02_word_counts_ngrams = Counter(mar02_word_counts_ngrams)
mar01_word_counts_ngrams = Counter(mar01_word_counts_ngrams)
feb28_word_counts_ngrams = Counter(feb28_word_counts_ngrams)
feb27_word_counts_ngrams = Counter(feb27_word_counts_ngrams)
feb26_word_counts_ngrams = Counter(feb26_word_counts_ngrams)
feb25_word_counts_ngrams = Counter(feb25_word_counts_ngrams)
feb24_word_counts_ngrams = Counter(feb24_word_counts_ngrams)
all_word_counts_ngrams = mar02_word_counts_ngrams + mar01_word_counts_ngrams + feb28_word_counts_ngrams + feb27_word_counts_ngrams + feb26_word_counts_ngrams + feb25_word_counts_ngrams + feb24_word_counts_ngrams
x = ['feb24', 'feb25', 'feb26', 'feb27', 'feb28', 'mar01', 'mar02']
# BUG FIX: this list was previously ordered mar02 -> feb24, the reverse of
# the x-axis labels above (and of counts_list), so every bigram time-series
# plot drew its days backwards. Keep it chronological, matching x.
ngrams_list = [feb24_word_counts_ngrams, feb25_word_counts_ngrams, feb26_word_counts_ngrams, feb27_word_counts_ngrams, feb28_word_counts_ngrams, mar01_word_counts_ngrams, mar02_word_counts_ngrams]
def plot_over_time(base, counts_list, start, stop):
    """Plot the daily counts of the n-grams ranked [start, stop) in `base`.

    base: aggregated (whole-week) Counter used to pick which n-grams to plot.
    counts_list: per-day Counters, aligned with the global `x` day labels.
    start, stop: slice of base.most_common() selecting the ranks to draw.
    """
    figure(figsize=(16, 10))
    for ngram, total in base.most_common()[start:stop]:
        # An n-gram absent from a day's Counter genuinely occurred 0 times.
        # The original .get() without a default returned None, which
        # matplotlib draws as a gap in the line instead of a zero.
        y = [day_counts.get(ngram, 0) for day_counts in counts_list]
        plt.plot(x, y, label=ngram + " " + str(total))
    plt.legend()
    plt.show()
# Weekly trend lines for the top 40 n-grams overall, ten per chart.
for rank in range(0, 40, 10):
    plot_over_time(all_word_counts_ngrams, ngrams_list, rank, rank + 10)
Feb 27: the news about Starlink terminals for Ukraine broke
plot_over_time(all_word_counts_ngrams, ngrams_list, 40, 50)
def plot_without_all_ngrams(base, counts_list, start, stop, all_counts=all_word_counts_ngrams):
    """Plot daily counts of `base`'s top n-grams, skipping the week-wide top 50.

    base: single-day Counter whose most common (non-global) n-grams are drawn.
    counts_list: per-day Counters, aligned with the global `x` day labels.
    start, stop: slice (after filtering) of base.most_common() to plot.
    all_counts: aggregate Counter whose 50 most common n-grams are excluded.
    """
    figure(figsize=(16, 10))
    # Lowercased top-50 overall n-grams; a set gives O(1) membership tests
    # (the original lowercased into a list and scanned it per candidate).
    overall_top = {ngram.lower() for ngram, _ in all_counts.most_common(50)}
    remaining = [(ngram, cnt) for ngram, cnt in base.most_common()
                 if ngram.lower() not in overall_top]
    for ngram, cnt in remaining[start:stop]:
        # Missing day -> 0 occurrences; the original .get() returned None,
        # which matplotlib renders as a gap rather than a zero.
        y = [day_counts.get(ngram, 0) for day_counts in counts_list]
        plt.plot(x, y, label=ngram + " " + str(cnt))
    plt.legend()
    plt.show()
plot_without_all_ngrams(feb24_word_counts_ngrams, ngrams_list, 0, 10)
C:\Users\jakub\anaconda3\envs\WBII\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning:
Glyph 128591 (\N{PERSON WITH FOLDED HANDS}) missing from current font.
# Remaining per-day n-gram plots (overall top-50 excluded), replayed as
# (day counter, start, stop) triples in the original notebook order.
_ngram_plot_calls = [
    (feb24_word_counts_ngrams, 10, 20),
    (feb24_word_counts_ngrams, 10, 20),  # duplicated cell in the original run
    (feb24_word_counts_ngrams, 20, 30),
    (feb25_word_counts_ngrams, 0, 10),
    (feb25_word_counts_ngrams, 10, 20),
    (feb26_word_counts_ngrams, 0, 10),
    (feb26_word_counts_ngrams, 10, 20),
    (feb27_word_counts_ngrams, 0, 10),
    (feb27_word_counts_ngrams, 10, 20),
    (feb28_word_counts_ngrams, 0, 10),
    (feb28_word_counts_ngrams, 10, 20),
    (mar01_word_counts_ngrams, 0, 10),
    (mar01_word_counts_ngrams, 10, 20),
    (mar02_word_counts_ngrams, 0, 10),
    (mar02_word_counts_ngrams, 10, 20),
    (mar02_word_counts_ngrams, 20, 30),
]
for day_counter, lo, hi in _ngram_plot_calls:
    plot_without_all_ngrams(day_counter, ngrams_list, lo, hi)
df = df.loc[df['Language'] == 'en']
48001*7-len(df)
428
# Drop the CSV index column and the now-constant Language column, renumber
# the rows, then run the spaCy pipeline over every tweet text (swifter
# parallelises the per-row apply).
df = df.drop(columns=['Unnamed: 0', 'Language']).reset_index()
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply: 0%| | 0/335579 [00:00<?, ?it/s]
df_copy = df
def cloud_from_lemmas(word_counts):
    """Draw a word cloud from a {lemma: frequency} mapping."""
    cloud = WordCloud(width=800, height=400)
    cloud.generate_from_frequencies(frequencies=word_counts)
    plt.figure(figsize=(10, 8))
    plt.imshow(cloud)
def plot_counts(counts):
    """Return a horizontal bar chart of a word/count DataFrame, top word first."""
    fig = px.bar(counts, orientation='h', y='word', x='count')
    # Attribute access is equivalent to the item-style fig['layout'][...] form.
    fig.layout.yaxis.autorange = "reversed"
    fig.update_layout(bargap=0.30, font={'size': 10})
    return fig
# Tokens to drop on top of stop words and punctuation: whitespace artefacts,
# flag-emoji halves, and the HTML-entity remnant "amp".
not_interesting = {"\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'}
df['lemmas'] = df.Text_en.apply(
    lambda doc: [tok.lemma_ for tok in doc
                 if not tok.is_stop and not tok.is_punct
                 and tok.lemma_ not in not_interesting])
# Distribution of per-tweet lengths (len() of each spaCy Doc, i.e. token
# counts): log-scaled histogram via the plotly pandas backend, then a
# matplotlib boxplot of the same series.
doc_lens = df["Text_en"].str.len()
doc_lens.hist(log_y=True)
fig, ax = plt.subplots(figsize=(19, 13))
ax.boxplot(doc_lens)
plt.show()
# Flatten every tweet's lemma list into one Counter.  The original used
# df.lemmas.sum(), which concatenates ~335k Python lists pairwise (quadratic);
# feeding a generator of words to Counter is a single linear pass and yields
# the same counts in the same insertion order.
word_counts = Counter(word for lemmas in df.lemmas for word in lemmas)
# Top-60 lemmas as a DataFrame for plotting.  (The original wrapped
# word_counts in Counter({k: v ...}) first — a no-op rebuild — before
# calling most_common.)
counts = pd.DataFrame(word_counts.most_common(60), columns=['word', 'count'])